#Please swipe in!!!
#Welcome to R! R is an open source software that has tremendous flexibility and capability.
#Unfortunately some of this flexibility comes at the cost of a steeper learning curve, but have no fear!
#What's so great about R?
#1. The open source nature means it is capable of almost anything (it just passed 10,000 packages!)
#2. Accessible and easy to download, no license restrictions
#3. Powerful enough for complex computing tasks, like machine learning
#4. Graphics capabilities surpass many other programs
#5. Works well with other programs like SQL, Excel, and Stata
#Some disadvantages
#1. We mentioned the steep learning curve
#2. Source documentation can be bad at times
#3. Most people work on it voluntarily, so generally no one to "complain" to
#4. Packages can contain errors -- though if it's popular these tend to get solved quickly
#We'll walk through the basics of coding in R.
#First we'll start with some simple arithmetic operations
2+2
2^3
factorial(6)
#We've used our first function here. There's a ton to be said about functions,
#but for now we can see how to get help for a particular function
?factorial
#R follows the usual order of operations: parentheses, then exponents, then * and /, then + and -
3+4*(4^2)
#We can also test for equality; the answer is a logical TRUE or FALSE
2==3
2^2==4
#Though we should watch out for rounding error, because == compares values exactly
#and sqrt(2) cannot be stored exactly as a floating-point number
sqrt(2)^2 == 2
#To help, all.equal() tests whether two numbers are equal within a small tolerance
all.equal(sqrt(2)^2, 2)
#Because a single "=" can be used for assignment (just like "<-"), we need the double == to test equality
#We can test inequalities too
2 < 4
#R can also use objects that we create, like a single value
x <- 2*3
#The print function will help us out...
print(x)
#Or just type the name of the object...
x
#We can also create a vector of values
y <- c(2,4,7)
#The c function is probably the most common in R; it combines all the elements inside the parentheses
?c
y
#We can now work with an entire vector, rather than single values
y+2
y*x
#We can see a complete list of the objects we have saved
ls()
#We can overwrite the objects we have saved; only the most recent value is kept
x <- 10
x <- 20
x
#We can also create sequences of values
x <- 1:10
x
#There are also more specific sequence functions
#To create a vector holding those same values...
x <- seq(1,10, by = 1)
x
#If we want to change the boundaries and increments
x <- seq(1, 12, by = .5)
x
#We can also repeat values a certain number of times
x <- rep(1, 10)
x
#We can repeat a vector a particular number of times
x <- rep(c(1,4), 10)
x
#Now combining all of this (both pieces have 100 elements, so they are added element by element)...
(rep(1:10, 10) + seq(1, 50.5, .5))
#If need be, we can remove particular objects
rm(x)
#Or we can clear the whole working environment
#(careful: this deletes every object listed by ls(), including y)
rm(list=ls())
#

#Vectors are only one of the data structures available in R; a matrix arranges values in rows and columns
#A single value we will combine with the matrix further down
x <- 2
#matrix() fills the cells column by column, so 1 and 2 end up in the first column
mymatrix <- matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 2,
  ncol = 3
)
mymatrix
#Objects we created earlier still work with a matrix: x is applied to every cell
mymatrix + x
mymatrix * x
#A matrix can report useful facts about itself, such as its dimensions (rows, then columns):
dim(mymatrix)
#The running total over all cells, taken column by column
cumsum(mymatrix)
#The total of each column...
colSums(mymatrix)
#...and the total of each row
rowSums(mymatrix)
#We can already build and analyze a large amount of mathematical operations using what we've learned so far...

#Set up the working directory
#getwd() reports the folder R currently reads files from and writes files to
getwd()
#setwd() changes that folder; replace this path with a folder that exists on your own computer
setwd("C:\\Users\\GradQuant\\Desktop")
getwd() #Files referred to by a bare name, like the ones below, are looked up in this folder
#(Packages are NOT installed here; they go to the library folders listed by .libPaths())


#We can import different types of data
#Both files must already be in the working directory, otherwise these calls stop with an error
#A whitespace-delimited text file...
data <- read.table("hmnrghts.txt", header = TRUE)
#...or a comma-separated file (this replaces the "data" object we just created)
data <- read.csv("hmnrghts.csv", header = TRUE)
#There is also the easy way over yonder using "Import Dataset" --->>
#R also has a huge number of built-in datasets (most of which are in specific packages)
data(mtcars)
#If we want to find out about the mtcars dataset
?mtcars
#We'll copy it into an object called "data" so that it's easier to work with
data <- mtcars

#To view the first few rows (six by default)
head(data)
#To adjust the number of rows
head(data, 4)
#Now we can view just the opposite: the last few rows
tail(data)
tail(data, 4)
#Say we wanted to view just one column
#(this call is meant to fail: in a fresh session there is no stand-alone object called mpg)
View(mpg)
#woops! We need to be more specific about our column
View(mtcars$mpg)
#That's more like it. Alternatively, we can "attach" our dataset, which lets us refer to its columns by name
#Be careful: the attached columns are a snapshot, so later changes to "data" do not update them,
#and any workspace object with the same name takes priority over an attached column
attach(data) 
View(mpg)
#Suppose we want to select particular columns, say the first three columns
newdata<-data[c(1,2,3)] #Or, written as a range:
newdata <- data[c(1:3)]
#ta-dah! The brackets are telling R to subset the data
newdata
#Or if we want columns that are not adjacent to each other (column 1 plus columns 3 through 6)
newdata<-data[c(1,3:6)] 
newdata
#We can also use this to drop variables with a minus sign (here columns 3 and 5)
droppeddata <- data[c(-3,-5)] 
droppeddata
#Or a more concise way for a run of adjacent columns (here columns 3, 4 and 5)
newdata <- data[c(-3:-5)] 
#Alternatively, we can select certain rows the same way; rows go before the comma, columns after it
newdata <- data[1:5,]
#Exercise Time!! Select all the cars with an mpg greater than 20.0, and name it "exercisedata"

#Now select cases with engine displacement over 200, and select the mpg, disp, and am columns

#Base R has a decent amount of functionality, but the real power comes from its open-source nature
#User-written packages greatly expand R's capabilities, but they need to be installed and loaded
 #installing external packages
#Install the package only if it is missing, so re-running the script does not reinstall it every time
if (!requireNamespace("matrixStats", quietly = TRUE)) {
  install.packages("matrixStats")
}
#now that it's installed, we need to load it
library(matrixStats)
#Now that we loaded it, we can check out the source documentation
?matrixStats
#Now let's do some basic stats on our mtcars dataset...the fun stuff!
#We can start just by finding the mean...
mean(data$mpg)
#So the average is barely over 20 mpg
#The standard deviation...
sd(data$mpg)
#Or if we want more complete information, the psych package has a describe() function
#psych still has to be installed first, even though we are not going to load it
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
psych::describe(data$mpg)
#Notice how we were able to use a function from a package without loading it?
#The package::function form works for any installed package, no library() call needed

#A simple t-test
#The vs variable records the engine shape: 0 = V-shaped, 1 = straight (see ?mtcars)
#We will compare mpg between the two vs groups
#First we need to check whether vs is actually a factor
is.factor(data$vs)
#No! So we need to convert it
data$vs <- as.factor(data$vs)
#Let's try it again
is.factor(data$vs)
#We can first describe the data in each group..
#We point at the columns inside "data" explicitly: the copy of vs made by attach() earlier
#was taken before the conversion and is still numeric
psych::describeBy(data$mpg, group = data$vs)
#Now we can do our t-test
t.test(mpg ~ vs, data = data)
#Way significant!
#We can do the same analysis in the form of a linear model
model <- lm(mpg ~ vs, data = data)
summary(model)
#Our output is telling the same story, phew!
#(t.test() applies the Welch correction by default, so its t statistic differs slightly
#from the linear model's; adding var.equal = TRUE to t.test() makes them match exactly)
#We can run an ANOVA now for more than two groups
#We have three cylinder groups; cyl isn't saved as a factor, but we can convert it inside the formula
#(the result gets its own name so that it does not hide R's built-in anova() function)
cyl_anova <- aov(mpg ~ as.factor(cyl), data = data)
summary(cyl_anova)
#Again as a linear model
model <- lm(mpg ~ as.factor(cyl), data = data)
summary(model)
#We see again that our F-value is the same, and everything. So why run it as a linear model?
#Let's take a look at all the elements in our model
str(model)
#This looks like gibberish, but it tells us the different elements that we can pull out
#For instance, we can see the residuals...
model$residuals
#We can check the residuals for normality using both a Q-Q plot and a histogram
qqnorm(model$residuals)
hist(model$residuals)
#Say we wanted to correlate mpg with engine displacement now
#We name the columns inside "data" explicitly so the code does not depend on attach()
cor(data$mpg, data$disp)
#not very much output. Let's try this instead
cor.test(data$mpg, data$disp)
#Now we can make more sense of it, there's a strong negative correlation here
#One of the biggest selling points of R is the capability to graph data
#The formula reads "mpg as a function of disp"; data = says where those columns live
plot(mpg ~ disp, data = data)
#Not the best of graphs, but the possibilities are endless. Let's start with better labels
plot(mpg ~ disp, data = data, xlab = "Engine Displacement", ylab = "Miles per gallon")
#And a main title
plot(mpg ~ disp, data = data, xlab = "Engine Displacement", ylab = "Miles per gallon",
     main = "Sample Scatterplot")
#We can also add a best fit line on top of the plot we just drew
abline(lm(mpg ~ disp, data = data))
#Exercise!! Find the correlation between weight (wt) and quarter mile time (qsec).
#THEN. Run a linear model, and create a scatterplot
#Let's dissect the anatomy of a function...
?lm
#All the options associated with a particular function are called "arguments"
#Some arguments have defaults that we can override
#Others don't have a default, so we may have to specify them
#If we supply the arguments in the order the function expects, we don't need to name them
lm(mpg~vs,data)
#If we get the order wrong, it returns an error
#(this call is meant to fail: R takes "data" as the formula and the formula as the data)
lm(data, mpg~vs)
#But we can counter it by naming each argument; named arguments can come in any order
lm(data=data, formula = mpg~vs)
#Finally, we can examine the relationship between two categorical variables
#We'll first create a contingency table of engine shape (vs) by transmission type (am)
#(named vs_am_counts so that it does not hide R's built-in table() function;
#data = tells xtabs() where to find the columns, so it does not depend on attach())
vs_am_counts <- xtabs(~ vs + am, data = data)
vs_am_counts
#And we can do a chi-square test on this table...
chisq.test(vs_am_counts)
#It's as easy as that!

#Upcoming R workshops:
#Graphing in R
#Data Manipulation in R
#If you have questions in the meantime, schedule a consultation!      
          